<?php
include_once '../settings.php'; 
//include_once APPROOT.'inc/simpleconfig.php';
include_once APPROOT.'common/MysqlDButil.php';
/**
 * A deliberately narrow (and fragile) scraper that only understands the
 * Baidu News result-page markup it was written against.
 *
 * @author yuyue
 * time:2010/11/26
 */
class CollectNews
{
    /**
     * Entry point: gathers every keyword to collect and scrapes each one.
     *
     * Keywords come from two queries, run in this order:
     *  1. sys_sort rows whose SORTID is six characters long (SORTNAME column);
     *  2. news_keyword rows with type=1 (user-defined keywords).
     */
    public function collectstart()
    {
        $dbuitl = new DButil();

        $queries = array(
            "select SORTNAME from sys_sort where CHAR_LENGTH(SORTID)=6 order by ID",
            // User-defined keywords.
            "select distinct keyword as SORTNAME from news_keyword where type=1 order by id",
        );

        $keywords = array();
        foreach ($queries as $sql) {
            $result = $dbuitl->exequery($sql);
            while ($row = $dbuitl->fetch_array($result)) {
                $keywords[] = $row['SORTNAME'];
            }
        }

        // A keyword present in both tables only needs to be scraped once;
        // array_unique keeps the first occurrence, so the order is unchanged.
        foreach (array_unique($keywords) as $keyword) {
            // self:: mirrors the original CollectNews:: call style: it forwards
            // $this when invoked on an instance.
            self::collect(50, $keyword);
        }
    }

    /**
     * Scrapes the newest Baidu News results for one keyword and stores them.
     *
     * Nothing is fetched or stored when the keyword is missing/empty or when
     * the result page cannot be downloaded. When the page downloads but no
     * item can be parsed, saveNews() is still called with an empty list.
     *
     * @param int    $keyid   value written to news.orderby for every stored row
     * @param string $keyname keyword to search for; presumably UTF-8, as the
     *                        GB2312 conversion below assumes - TODO confirm
     */
    public function collect($keyid = 50, $keyname = null)
    {
        // No keyword, nothing to collect.
        if ($keyname === null || $keyname === '') {
            return;
        }

        // The result page is decoded from GBK below, so the query term is sent
        // in the matching GB2312 encoding before being URL-encoded.
        $converted = iconv('utf-8', 'gb2312//IGNORE', $keyname);
        if ($converted === false || $converted === '') {
            return;
        }
        $key = urlencode($converted);

        // Keyword search, newest first, ten results.
        $url = "http://news.baidu.com/ns?word=$key&sr=0&cl=2&rn=10&tn=newsA&ct=0&clk=sortbytime";

        // Some keywords cannot be opened at all; that is an expected,
        // best-effort failure, so the warning is suppressed and we skip.
        // A single request replaces the former fopen()+file_get_contents()
        // pair, which fetched the page twice and never closed the handle.
        $fcontent = @file_get_contents($url);
        if ($fcontent === false) {
            return;
        }

        self::saveNews($keyid, $keyname, self::parseNews($fcontent));
    }

    /**
     * Extracts the news items from a raw (GBK) Baidu News result page.
     *
     * The patterns are the original POSIX ones ported to PCRE; the "s"
     * modifier keeps "." matching newlines and "i" keeps them case-insensitive.
     *
     * @param string $html raw page bytes as downloaded
     * @return array list of array('nhref' => ..., 'ntitle' => ..., 'nsummary' => ...)
     */
    private static function parseNews($html)
    {
        $news = array();

        // Narrow down step by step: result container, then the run of tables.
        if (!preg_match('#<div id="r"(.*)<div class="pg#is', $html, $container)) {
            return $news;
        }
        if (!preg_match('#<table(.*)</table>#is', $container[0], $tables)) {
            return $news;
        }

        foreach (explode('</table><br>', $tables[0]) as $fragment) {
            // One news item, converted to UTF-8 for storage.
            $xinwen = iconv('gbk', 'utf-8//IGNORE', $fragment);
            if ($xinwen === false) {
                continue;
            }

            // An item without both a link and a title is not a news entry
            // (typically the trailing fragment after the last separator).
            if (!preg_match('#<td class="text"><a href="(.*) "  mon#is', $xinwen, $href)
                || !preg_match('#<span><b>(.*)</b></span>#is', $xinwen, $title)
            ) {
                continue;
            }

            $summary = '';
            if (preg_match('#<font size=-1>(.*)...</font>#is', $xinwen, $intro)) {
                $summary = $intro[1];
            }

            $news[] = array(
                'nhref'    => $href[1],
                'ntitle'   => $title[1],
                'nsummary' => $summary,
            );
        }

        return $news;
    }

    /**
     * Replaces the stored news of one keyword and updates its collection status.
     *
     * Existing rows for the keyword are always deleted first. The insert is
     * skipped when there is nothing to store. Finally the news_keyword row is
     * created (user_id 100) or marked as collected.
     *
     * @param int    $keyid   value written to news.orderby
     * @param string $keyname keyword the news belongs to
     * @param array  $news    items with 'ntitle' and 'nhref' keys
     */
    public function saveNews($keyid, $keyname, $news)
    {
        $dbuitl  = new DButil();
        $keyword = self::quote($keyname);
        $orderby = (int) $keyid;

        // Clear out this keyword's previous news first.
        $dbuitl->exequery("delete from news where keyword=$keyword");

        $values = array();
        if (is_array($news)) {
            foreach ($news as $item) {
                $title = isset($item['ntitle']) ? $item['ntitle'] : '';
                $link  = isset($item['nhref']) ? $item['nhref'] : '';
                $values[] = '(' . $keyword . ',' . self::quote($title) . ',' . self::quote($link)
                    . ",$orderby,now())";
            }
        }

        // An empty VALUES list is a syntax error, so only insert real rows.
        if (count($values) > 0) {
            $dbuitl->exequery(
                "insert into news (keyword,title,news_url,orderby,created) values " . implode(',', $values)
            );
        }

        $reuslt = $dbuitl->exequery("select count(*) from news_keyword where keyword=$keyword");
        $row    = $dbuitl->fetch_array($reuslt);
        $known  = (is_array($row) && isset($row[0])) ? (int) $row[0] : 0;

        if ($known === 0) {
            $sql3 = "insert into news_keyword(user_id,keyword,is_collection,created, modified) values (100,$keyword,1,now(),now())";
        } else {
            $sql3 = "update news_keyword set modified = now(),is_collection=1  where keyword=$keyword";
        }
        $dbuitl->exequery($sql3);
    }

    /**
     * Wraps a value in single quotes for use as an SQL string literal.
     *
     * NOTE(review): DButil's API is not visible here. If it offers prepared
     * statements or a connection-aware escape function, use that instead;
     * addslashes() is only adequate for single-byte/UTF-8 connections running
     * in MySQL's default (backslash-escaping) mode.
     *
     * @param mixed $value scalar to embed
     * @return string quoted, escaped literal
     */
    private static function quote($value)
    {
        return "'" . addslashes((string) $value) . "'";
    }
}